cd "C:\Users\remij\Desktop\Replication Files JOEG\Stata"
set more off

*********************
*********************
*** MAIN DATA SET ***
*********************
*********************

*********
* YEARS *
*********

* Build the list of sample years (1751-1932); it is crossed with the grid cells further down.
clear
import excel "years.xlsx", sheet("Sheet1") firstrow clear
keep if inrange(year, 1751, 1932)
sort year
count
* 182
save years, replace

****************************************
* JEDWAB & MORADI 2016 MAIN DATA SET *
****************************************

* Main data set on Ghana from Jedwab & Moradi 2016
* Jedwab, Remi and Alexander Moradi, “The Permanent Effects of Transportation Revolutions in Poor Countries: Evidence from Africa,” Review of Economics and Statistics, 2016, 98 (2), 268–284.
use ghana1, clear
* Retain a single observation per cell (the 1891 row); the year itself is not needed
keep if year == 1891
drop year
* Discard the variables that are not used in this project
drop rail18_10-railstat18_40 tk_10-dist2linepost18 dist2apamok-dist2iv upop1666_2000-upop2000_2000 paved* improved* earthern* church* school* minvalue* mintype* minprod*
drop gvt1901-educ_sss_sh
count
* 2,091 cells (includes the North)
* Pair every cell with every year of the "years" file
cross using years
count
* 2,091 cells x 182 years (1751-1932) = 380,562 observations
label variable year "Year"
order gridcell year
* Cell-year identifier: the cell code followed by the year
generate gridcellyear = gridcell + string(year)
save ghana_missions1, replace
count
* 380,562

***********************************************
* ADDITIONAL INFORMATION FROM THE 1901 CENSUS *
***********************************************

* Additional information from the 1901 census. 
* The file was created for Jedwab & Moradi 2016.
* Store the census file sorted by cell: the old-style merge below needs sorted inputs
use popcensus_1901, clear
sort gridcell
save popcensus_1901, replace

* Attach the cell-level census variables to the cell-year panel
use ghana_missions1, clear
sort gridcell
merge gridcell using popcensus_1901
tabulate _merge
drop _merge
save ghana_missions1, replace
count
* 380,562

********************
* MISSION DATA SET *
********************

*** PRIESTS DATA ***

* Copy the 1752 priest variables (eur*, afr*) into a 1751 row
use "church_grid0_12072017\church_grid0", clear
keep gridcell year eur* afr*
keep if year == 1752
replace year = 1751
save church_grid0priests_1751, replace

* Stack that 1751 copy under the church file
* NOTE(review): the church file is appended in full here (it is not restricted to eur*/afr*),
* so church_grid0priests_all carries every church_grid0 variable -- confirm this is intended
use "church_grid0_12072017\church_grid0", clear
append using church_grid0priests_1751
sort gridcell year
save church_grid0priests_all, replace
count
* 380,562
capture erase "church_grid0priests_1751.dta"

*** CHURCH DATA SET ***

* Copy the 1752 cross-section into a 1751 row
use "church_grid0_12072017\church_grid0", clear
keep if year == 1752
replace year = 1751 if year == 1752
save church_grid0_1751, replace

* Full church panel plus the 1751 copy
use "church_grid0_12072017\church_grid0", clear
append using church_grid0_1751
*** Missions ***
* Total number of missions across the four denominations
egen missions_num = rsum(cmeth cpresb ccath coth)
* Dummy for at least one mission. The second condition excludes missing values,
* which Stata treats as larger than any number (the original tested "!= 0", a tautology).
gen missions_yn = (missions_num > 0 & !missing(missions_num))
tab missions_yn
*** Main stations ***
* Same construction for main stations
egen mainstat_num = rsum(cmeth_main cpresb_main ccath_main coth_main)
gen mainstat_yn = (mainstat_num > 0 & !missing(mainstat_num))
tab mainstat_yn
*** Missionaries ***
sort gridcell year
save church_grid0_2, replace
count
* 380,562
capture erase "church_grid0_1751.dta"

* We add the church and priest files to the main data set.
use ghana_missions1, clear
sort gridcell year
merge gridcell year using church_grid0_2
tabulate _merge
* List, then discard, the cell-years that exist only in the church file
tabulate gridcell if _merge == 2
drop if _merge == 2
drop _merge
* Same steps for the priest file
sort gridcell year
merge gridcell year using church_grid0priests_all
tabulate _merge
tabulate gridcell if _merge == 2
drop if _merge == 2
drop _merge
save ghana_missions1, replace
count
* 380,562
* Remove the intermediate files
capture erase "church_grid0_2.dta"
capture erase "church_grid0priests_all.dta"

*************
* REGION FE *
*************

* 4 Region FE
* Store the region file sorted by cell for the old-style merge
use partition, clear
sort gridcell
save partition, replace

* Attach the region variables to the panel
use ghana_missions1, clear
sort gridcell
merge gridcell using partition
tabulate _merge
drop _merge
save ghana_missions1, replace

************************
* SPHERES OF INFLUENCE *
************************

* These data come from an 1888 map of the spheres of influence.
* A sorted copy is saved under a new name (dist2spheres2)
use dist2spheres.dta, clear
sort gridcell
save dist2spheres2, replace

* Attach the variables to the panel
use ghana_missions1, clear
sort gridcell
merge gridcell using dist2spheres2
tabulate _merge
drop _merge
save ghana_missions1, replace

*****************************
* HISTORICAL MALARIA BURDEN *
*****************************

* Read the cell-level CSV, keep the mean under a shorter name, drop the *count columns
clear
insheet using "Grid_0.1x0.1_Ghana_2091_dcw_corrected.csv"
rename histo_malaria_mean histo_malaria
label variable histo_malaria "Historical malaria burden - S gene frecuency"
drop *count
sort gridcell
save malaria_historical, replace

* Attach to the panel
use ghana_missions1, clear
sort gridcell
merge gridcell using malaria_historical
tabulate _merge
drop _merge
save ghana_missions1, replace

******************************
* LYSENKO MALARIA ENDEMICITY *
******************************

* Read the cell-level CSV and rename the variable
clear
insheet using "Grid_0.1x0.1_Ghana_2091_lysenko.csv"
rename lysenko lysenko_malaria
label variable lysenko_malaria "Malaria endemicity (1900)"
sort gridcell
save malaria_lysenko, replace

* Attach to the panel
use ghana_missions1, clear
sort gridcell
merge gridcell using malaria_lysenko
tabulate _merge
drop _merge
save ghana_missions1, replace

*************************************
* KISZWESKI MALARIA STABILITY INDEX *
*************************************

* Read the cell-level CSV, keep the mean under a descriptive name, drop the *count columns
clear
insheet using "Grid_0.1x0.1_Ghana_2091_Kiszweski.csv"
ren malaria_mean kiszweski_malaria
label var kiszweski_malaria "Malaria stability index (2004)"
drop *count
sort gridcell
* Saved under its own name: the original wrote to malaria_lysenko.dta,
* overwriting the Lysenko file created in the previous section
save malaria_kiszweski, replace

* Attach to the panel
use ghana_missions1, clear
sort gridcell
merge gridcell using malaria_kiszweski
tab _m
drop _m
save ghana_missions1, replace

**********************************************
* HISTORICAL TSETSE SUITABILITY INDEX (1871) *
**********************************************

* Read the cell-level CSV; the variable tsi is used as is
clear
insheet using "Grid_0.1x0.1_Ghana_2091_alsan.csv"
label variable tsi "Tsetse suitability index (1871)"
sort gridcell
save tsi, replace

* Attach to the panel
use ghana_missions1, clear
sort gridcell
merge gridcell using tsi
tabulate _merge
drop _merge
save ghana_missions1, replace

**********************************
* TSETSE INDICES FROM FAO (2000) *
**********************************

* Read the cell-level CSV and keep only the cell code and the *mean variables
clear
insheet using "Grid_0.1x0.1_Ghana_2091_tsetse.csv"
label variable fusca_mean "Predicted percentage probability of presence: Fuscas"
label variable morsitans_mean "Predicted percentage probability of presence: Morsitans"
label variable palpalis_mean "Predicted percentage probability of presence: Palpalis"
keep gridcell *mean
sort gridcell
save tsetse, replace

* Attach to the panel
use ghana_missions1, clear
sort gridcell
merge gridcell using tsetse
tabulate _merge
drop _merge
save ghana_missions1, replace

*********************************************
* PROBABILITY OF OCCURENCE OF DENGUE (2010) *
*********************************************

* Read the cell-level CSV and drop the *count columns
clear
insheet using "Grid_0.1x0.1_Ghana_2091_dengue.csv"
label variable dengue_mean "Probability of occurence of dengue (2010)"
drop *count
sort gridcell
save dengue, replace

* Attach to the panel
use ghana_missions1, clear
sort gridcell
merge gridcell using dengue
tabulate _merge
drop _merge
save ghana_missions1, replace

*************
* RAILROADS *
*************

* The file was created for Jedwab & Moradi 2016.
* It is merged on cell and year
use raildist, clear
rename dist dist2rail
label variable dist2rail "Distance to railroad line (km) in year t"
sort gridcell year
save raildist2, replace

use ghana_missions1, clear
sort gridcell year
merge gridcell year using raildist2
tabulate _merge
drop _merge
* Indicator equal to 1 whenever a distance to the rail line is available for the cell-year
generate dist2rail_yn = (dist2rail != .)
save ghana_missions1, replace
count
* 380,562

*****************
* SLAVE MARKETS *
*****************

* The sources are Perbi (2004) and Anquandah (2013).
* "Strict" list: one row per cell among the towns whose type is "Market"
clear
import delimited "slaveroutetowns_all_final.csv"
keep if type == "Market"
bysort gridcell: keep if _n == 1
keep gridcell
generate slavemarket_strict = 1
save slavemarket_strict, replace
* "Broad" list: one row per cell among all the towns of the file
clear
import delimited slaveroutetowns_all_final.csv
bysort gridcell: keep if _n == 1
keep gridcell
generate slavemarket_broad = 1
save slavemarket_broad, replace

* Attach both dummies to the panel; cells absent from a list get 0
use ghana_missions1, clear
sort gridcell
merge gridcell using slavemarket_strict
tabulate _merge
drop _merge
replace slavemarket_strict = 0 if slavemarket_strict == .
label variable slavemarket_strict "Slave market in the cell (not incl. castles and forts)"
sort gridcell
merge gridcell using slavemarket_broad
tabulate _merge
drop _merge
replace slavemarket_broad = 0 if slavemarket_broad == .
label variable slavemarket_broad "Slave market in the cell (incl. castles and forts)"
save ghana_missions1, replace
count
* 380,562

* Distance to the nearest slave-market cell: first "strict", then "broad"
foreach kind in strict broad {
    * Coordinates of the cells that host a slave market (1932 cross-section)
    use ghana_missions1, clear
    keep if year == 1932 & slavemarket_`kind' == 1
    rename gridcell slavecell
    rename longitude slave_lon
    rename latitude slave_lat
    save slave_points_temp, replace

    * Form all (cell, market cell) pairs and keep the shortest distance per cell
    use ghana_missions1, clear
    keep if year == 1932
    keep gridcell longitude latitude
    rename longitude cell_lon
    rename latitude cell_lat
    cross using slave_points_temp
    geodist cell_lat cell_lon slave_lat slave_lon, gen(dist)
    collapse (min) dist, by(gridcell)
    rename dist slavemarket_`kind'_dist
    save slavedist_temp, replace

    * Attach the cell-level distance to the panel
    use ghana_missions1, clear
    sort gridcell
    merge gridcell using slavedist_temp
    tabulate _merge
    drop _merge
    label variable slavemarket_`kind'_dist "Euclidean distance to a slave market - `kind'"
    save ghana_missions1, replace
}

****************
* SLAVE ROUTES *
****************

* Dummy: The source is Perbi (2004)
* The "description" column of the CSV holds the cell code
clear
import delimited slave_routes_peki_cells.csv, clear
rename description gridcell
keep gridcell
generate slave_route_perbi_yn = 1
sort gridcell
save slave_route_perbi_yn, replace
use ghana_missions1, clear
sort gridcell
merge gridcell using slave_route_perbi_yn
tabulate _merge
drop _merge
* Cells absent from the route file get 0
replace slave_route_perbi_yn = 0 if slave_route_perbi_yn == .
label variable slave_route_perbi_yn "Slave route in the cell (Perbi 2004)"
*tab slave_route_perbi_yn if year == 1932
* 437
save ghana_missions1, replace

* Dummy: The source is Anquandah (2013)
clear
import delimited slave_routes_anquandah_cells.csv, clear
rename description gridcell
keep gridcell
generate slave_route_anq_yn = 1
sort gridcell
save slave_route_anq_yn, replace
use ghana_missions1, clear
sort gridcell
merge gridcell using slave_route_anq_yn
tabulate _merge
drop _merge
replace slave_route_anq_yn = 0 if slave_route_anq_yn == .
label variable slave_route_anq_yn "Slave route in the cell (Anquandah 2013)"
tabulate slave_route_anq_yn if year == 1932
save ghana_missions1, replace

* Distance to the nearest slave-route cell, by source: Perbi (2004), then Anquandah (2013).
* The label of the Anquandah distance used to read "Anquandah 2004"; it now cites 2013,
* as everywhere else in this file.
foreach src in perbi anq {
    if "`src'" == "perbi" {
        local cellsfile "slave_routes_peki_cells.csv"
        local citation "Perbi 2004"
    }
    else {
        local cellsfile "slave_routes_anquandah_cells.csv"
        local citation "Anquandah 2013"
    }

    * Route cells and their coordinates ("description" holds the cell code)
    clear
    import delimited `cellsfile', clear
    count
    ren description slavecell
    ren longitude slave_lon
    ren latitude slave_lat
    save slave_points_temp, replace

    * Form all (cell, route cell) pairs and keep the shortest distance per cell
    use ghana_missions1, clear
    keep if year == 1932
    keep gridcell longitude latitude
    ren longitude cell_lon
    ren latitude cell_lat
    cross using slave_points_temp
    geodist cell_lat cell_lon slave_lat slave_lon, gen(dist)
    collapse (min) dist, by(gridcell)
    ren dist slave_route_`src'_dist
    save slavedist_temp, replace

    * Attach the cell-level distance to the panel
    use ghana_missions1, clear
    sort gridcell
    merge gridcell using slavedist_temp
    tab _m
    drop _m
    label var slave_route_`src'_dist "Euclidean distance to slave route (`citation')"
    save ghana_missions1, replace
}
count
* 380,562

***************
* PLANTATIONS *
***************

* Plantations 1900 (Dickson p.149)
* One row per cell: any plantation, plus one dummy per crop
clear
import delimited plantations_1900.csv, clear
generate plantation = 1
collapse (max) plantation coconut-banana, by(gridcell)
rename oil_palm palmoil
rename palm_pol_f palmoil_factory
* A cell with a palm oil factory is counted as a palm oil cell
replace palmoil = 1 if palmoil_factory == 1
drop palmoil_factory
foreach crop in coconut rubber palmoil cocoa coffee banana {
    rename `crop' plant_`crop'
}
order gridcell plantation plant_*
sort gridcell
save plantations_1900, replace

* Attach to the panel; cells absent from the plantation file get 0
use ghana_missions1, clear
sort gridcell
merge gridcell using plantations_1900
tabulate _merge
drop _merge
foreach v of varlist plantation plant_* {
    replace `v' = 0 if `v' == .
    label variable `v' "Dummy if `v' in the cell (1900)"
}
save ghana_missions1, replace

* Distance to the nearest plantation cell: any plantation, then palm oil, then rubber
foreach crop in plantation palmoil rubber {
    * flag = dummy selecting the target cells; what = wording used in the variable label
    if "`crop'" == "plantation" {
        local flag plantation
        local what "a plantation"
    }
    else if "`crop'" == "palmoil" {
        local flag plant_palmoil
        local what "a palm oil plantation"
    }
    else {
        local flag plant_rubber
        local what "a rubber plantation"
    }

    * Coordinates of the target cells (1932 cross-section)
    use ghana_missions1, clear
    keep if year == 1932 & `flag' == 1
    rename gridcell plantacell
    rename longitude planta_lon
    rename latitude planta_lat
    save planta_points_temp, replace

    * Form all (cell, target cell) pairs and keep the shortest distance per cell
    use ghana_missions1, clear
    keep if year == 1932
    keep gridcell longitude latitude
    rename longitude cell_lon
    rename latitude cell_lat
    cross using planta_points_temp
    geodist cell_lat cell_lon planta_lat planta_lon, gen(dist)
    collapse (min) dist, by(gridcell)
    rename dist `crop'1900_dist
    save plantadist_temp, replace

    * Attach the cell-level distance to the panel
    use ghana_missions1, clear
    sort gridcell
    merge gridcell using plantadist_temp
    tabulate _merge
    drop _merge
    label variable `crop'1900_dist "Euclidean distance to `what' in 1900 (Dickson)"
    save ghana_missions1, replace
}

* Danish plantations
* Every cell listed in the CSV gets the dummy
clear
import delimited danish_plantation_v2.csv, clear
keep gridcell
generate danishplantation = 1
sort gridcell
save danishplantation, replace

* Attach to the panel; cells absent from the file get 0
use ghana_missions1, clear
sort gridcell
merge gridcell using danishplantation
tabulate _merge
drop _merge
replace danishplantation = 0 if danishplantation == .
label variable danishplantation "Dummy if 19th century Danish plantation in the cell"
save ghana_missions1, replace
count
* 380,562

****************
* MODERN MINES *
****************

* Simple dummies *
* See the text for details.
* A record counts when its quantity is strictly positive and not missing.
clear
import excel "mines_until_1932.xlsx", sheet("Sheet1") firstrow clear
rename Year year
generate gold_yn = (Type == "Gold_ounces" & Quantity > 0 & Quantity != .)
generate diam_yn = (Type == "Diamonds_carats" & Quantity > 0 & Quantity != .)
generate mang_yn = (Type == "Manganese_tons" & Quantity > 0 & Quantity != .)
generate mine_yn = (Quantity > 0 & Quantity != .)
* One row per cell-year
collapse (max) *_yn, by(gridcell year)
sort gridcell year
save modernmines, replace

* Attach to the panel; cell-years absent from the mines file get 0
use ghana_missions1, clear
sort gridcell year
merge gridcell year using modernmines
tabulate _merge
drop _merge
foreach v of varlist gold_yn-mine_yn {
    replace `v' = 0 if `v' == .
    label variable `v' "Dummy if mine `v' in the cell in year t"
}
* 1878 first year for Tarkwa (P62): switch the dummies on for 1878-1900
foreach v of varlist gold_yn mine_yn {
    replace `v' = 1 if gridcell == "P62" & `v' == 0 & inrange(year, 1878, 1900)
}
save ghana_missions1, replace

* Yearly distance (1878-1932) to the nearest cell with an active mine.
* First pass: any mine (3 gold, 1 manganese, 1 diamonds).
* Second pass: gold mines only (3 gold: Obuasi, Prestea, Tarkwa).
foreach kind in mine gold {
    if "`kind'" == "mine" {
        local distvar minet_dist
        local what "a modern mine"
    }
    else {
        local distvar goldminet_dist
        local what "a modern gold mine"
    }

    forvalues yr = 1878/1932 {
        * Coordinates of the cells with an active mine in that year
        use ghana_missions1, clear
        keep if year == `yr' & `kind'_yn == 1
        keep gridcell longitude latitude
        rename gridcell minecell
        rename longitude mine_lon
        rename latitude mine_lat
        save mine_points_temp, replace

        * Form all (cell, mine cell) pairs and keep the shortest distance per cell
        use ghana_missions1, clear
        keep if year == 1932
        keep gridcell longitude latitude
        rename longitude cell_lon
        rename latitude cell_lat
        cross using mine_points_temp
        geodist cell_lat cell_lon mine_lat mine_lon, gen(dist)
        collapse (min) dist, by(gridcell)
        rename dist `distvar'
        generate year = `yr'
        sort gridcell year
        save minedist_temp, replace

        * The update option fills in the rows of that year in the panel
        use ghana_missions1, clear
        sort gridcell year
        merge gridcell year using minedist_temp, update
        tabulate _merge
        drop _merge
        save ghana_missions1, replace
    }
    label variable `distvar' "Euclidean distance to `what' in t"
    save ghana_missions1, replace
    count
    * 380,562
}

*********************
* FORTS AND CASTLES *
*********************

* Simple dummies *
* See text for details on the sources.
* Per cell: number of castles/forts and a presence dummy
clear
import delimited forts_castles_vfinal.csv
generate castlefort = 1
collapse (sum) castlefort_num = castlefort (max) castlefort_yn = castlefort, by(gridcell)
sort gridcell
save forts_castles_vfinal, replace
* We could also use the information on the year of creation and the identity of the colonizer

* Attach to the panel; cells absent from the file get 0
use ghana_missions1, clear
sort gridcell
merge gridcell using forts_castles_vfinal
tabulate _merge
drop _merge
foreach v of varlist castlefort* {
    replace `v' = 0 if `v' == .
}
label variable castlefort_num "Number of castles or forts in the cell"
label variable castlefort_yn "Dummy if castle or fort in the cell"
save ghana_missions1, replace
count
* 380,562

****************
* TRADING POST *
****************

* Simple dummies *
* See text for details on the sources.
* Per cell: number of trading posts and a presence dummy
clear
import delimited trading_post_1700.csv
generate tradepost1700 = 1
collapse (sum) tradepost1700_num = tradepost1700 (max) tradepost1700_yn = tradepost1700, by(gridcell)
sort gridcell
save tradepost1700, replace
* We could also use the information on the year of creation and the identity of the colonizer

* Attach to the panel; cells absent from the file get 0
use ghana_missions1, clear
sort gridcell
merge gridcell using tradepost1700
tabulate _merge
drop _merge
foreach v of varlist tradepost1700* {
    replace `v' = 0 if `v' == .
}
label variable tradepost1700_num "Number of tradepost1700 in the cell"
label variable tradepost1700_yn "Dummy if tradepost1700 in the cell"
save ghana_missions1, replace
count
* 380,562

* Distance
* Coordinates of the cells with a trading post (1932 cross-section)
use ghana_missions1, clear
keep if year == 1932 & tradepost1700_yn == 1
keep gridcell longitude latitude
rename gridcell tradepostcell
rename longitude tradepost_lon
rename latitude tradepost_lat
save tradepost_points_temp, replace

* Form all (cell, trading-post cell) pairs and keep the shortest distance per cell
use ghana_missions1, clear
keep if year == 1932
keep gridcell longitude latitude
rename longitude cell_lon
rename latitude cell_lat
cross using tradepost_points_temp
geodist cell_lat cell_lon tradepost_lat tradepost_lon, gen(dist)
collapse (min) dist, by(gridcell)
rename dist tradepost_dist
save tradepostdist_temp, replace

* Attach the cell-level distance to the panel
use ghana_missions1, clear
sort gridcell
merge gridcell using tradepostdist_temp
tabulate _merge
drop _merge
label variable tradepost_dist "Euclidean distance to a tradepost1700"
save ghana_missions1, replace
count
* 380,562

**************
* PORTS 1702 *
**************

* Simple dummies *
* See text for details on the sources.
* Per cell: number of ports and a presence dummy
clear
import delimited port_1702_vfinal.csv
generate port1702 = 1
collapse (sum) port1702_num = port1702 (max) port1702_yn = port1702, by(gridcell)
sort gridcell
save port1702, replace
* We could also use the information on the year of creation and the identity of the colonizer

* Attach to the panel; cells absent from the file get 0
use ghana_missions1, clear
sort gridcell
merge gridcell using port1702
tabulate _merge
drop _merge
foreach v of varlist port1702* {
    replace `v' = 0 if `v' == .
}
label variable port1702_num "Number of port1702 in the cell"
label variable port1702_yn "Dummy if port1702 in the cell"
save ghana_missions1, replace
count
* 380,562

***************************************
* PORTS 1850 1900 1910 1920 1925 1931 *
***************************************

* Simple dummies *
* See text for details on the sources.
* One CSV per benchmark year; same construction for each of them
foreach yr in 1850 1900 1910 1920 1925 1931 {
    * Per cell: number of ports and a presence dummy
    clear
    import delimited port_`yr'.csv
    generate port`yr' = 1
    collapse (sum) port`yr'_num = port`yr' (max) port`yr'_yn = port`yr', by(gridcell)
    sort gridcell
    save port`yr', replace
    * We could also use the information on the year of creation and the identity of the colonizer

    * Attach to the panel; cells absent from the file get 0
    use ghana_missions1, clear
    sort gridcell
    merge gridcell using port`yr'
    tabulate _merge
    drop _merge
    foreach v of varlist port`yr'* {
        replace `v' = 0 if `v' == .
    }
    label variable port`yr'_num "Number of port in `yr' in the cell"
    label variable port`yr'_yn "Dummy if port in `yr' in the cell"
    save ghana_missions1, replace
}
count
* 380,562

* Distance to the nearest port cell, for each benchmark year
foreach yr in 1850 1900 1910 1920 1925 1931 {
    clear

    * Coordinates of the cells with a port in that benchmark year (1932 cross-section)
    use ghana_missions1, clear
    keep if year == 1932 & port`yr'_yn == 1
    keep gridcell longitude latitude
    rename gridcell portcell
    rename longitude port_lon
    rename latitude port_lat
    save port_points_temp, replace

    * Form all (cell, port cell) pairs and keep the shortest distance per cell
    use ghana_missions1, clear
    keep if year == 1932
    keep gridcell longitude latitude
    rename longitude cell_lon
    rename latitude cell_lat
    cross using port_points_temp
    geodist cell_lat cell_lon port_lat port_lon, gen(dist)
    collapse (min) dist, by(gridcell)
    rename dist port`yr'_dist
    save port_temp, replace

    * Attach the cell-level distance to the panel
    use ghana_missions1, clear
    sort gridcell
    merge gridcell using port_temp
    tabulate _merge
    drop _merge
    label variable port`yr'_dist "Euclidean distance to a port existing in `yr'"
    save ghana_missions1, replace
    count
    * 380,562

}

*********************
* TRADE ROUTES 1850 *
**********************

* Simple dummies *
* See text for details on the sources.
* The "description" column of the CSV holds the cell code
clear
import delimited trade_routes18501890_cells.csv
keep description
rename description gridcell
generate traderoute18501890 = 1
sort gridcell
save traderoutes18501890, replace

* Attach to the panel; cells absent from the file get 0
use ghana_missions1, clear
sort gridcell
merge gridcell using traderoutes18501890
tabulate _merge
drop _merge
replace traderoute18501890 = 0 if traderoute18501890 == .
label variable traderoute18501890 "Dummy if trade route in 1850-1890 in the cell"
save ghana_missions1, replace

* Distance

* Coordinates of the cells crossed by a trade route (1932 cross-section)
use ghana_missions1, clear
keep if year == 1932 & traderoute18501890 == 1
keep gridcell longitude latitude
rename gridcell traderoutecell
rename longitude traderoute_lon
rename latitude traderoute_lat
save traderoute_points_temp, replace

* Form all (cell, route cell) pairs and keep the shortest distance per cell
use ghana_missions1, clear
keep if year == 1932
keep gridcell longitude latitude
rename longitude cell_lon
rename latitude cell_lat
cross using traderoute_points_temp
geodist cell_lat cell_lon traderoute_lat traderoute_lon, gen(dist)
collapse (min) dist, by(gridcell)
rename dist traderoute18501890_dist
save traderoutedist_temp, replace

* Attach the cell-level distance to the panel
use ghana_missions1, clear
sort gridcell
merge gridcell using traderoutedist_temp
tabulate _merge
drop _merge
label variable traderoute18501890_dist "Euclidean distance to a trade route in 1850-1890"
save ghana_missions1, replace
count
* 380,562

*********************
* TRADE ROUTES 1702 *
*********************

* Simple dummies *
* See text for details on the sources.
* The "description" column of the CSV holds the cell code
clear
import delimited trade_routes1702_new_cells.csv
keep description
rename description gridcell
generate traderoute1702 = 1
sort gridcell
save traderoutes1702, replace

* Attach to the panel; cells absent from the file get 0
use ghana_missions1, clear
sort gridcell
merge gridcell using traderoutes1702
tabulate _merge
drop _merge
replace traderoute1702 = 0 if traderoute1702 == .
label variable traderoute1702 "Dummy if trade route in 1702 in the cell"
save ghana_missions1, replace

* Distance
* Coordinates of the cells crossed by a trade route (1932 cross-section)
use ghana_missions1, clear
keep if year == 1932 & traderoute1702 == 1
keep gridcell longitude latitude
rename gridcell traderoutecell
rename longitude traderoute_lon
rename latitude traderoute_lat
save traderoute_points_temp, replace

* Form all (cell, route cell) pairs and keep the shortest distance per cell
use ghana_missions1, clear
keep if year == 1932
keep gridcell longitude latitude
rename longitude cell_lon
rename latitude cell_lat
cross using traderoute_points_temp
geodist cell_lat cell_lon traderoute_lat traderoute_lon, gen(dist)
collapse (min) dist, by(gridcell)
rename dist traderoute1702_dist
save traderoutedist_temp, replace

* Attach the cell-level distance to the panel
use ghana_missions1, clear
sort gridcell
merge gridcell using traderoutedist_temp
tabulate _merge
drop _merge
label variable traderoute1702_dist "Euclidean distance to a trade route in 1702"
save ghana_missions1, replace
count
* 380,562

***************************
* JOURNEYS OF EXPLORATION *
***************************

*** All ***

* Simple dummies *
* See text for details on the sources.
* Every cell listed in the journeys file gets the dummy
clear
import delimited journeys_cells.csv
keep gridcell
generate exploroute188089 = 1
sort gridcell
save exploroute188089, replace

* Attach to the panel; cells absent from the file get 0
use ghana_missions1, clear
sort gridcell
merge gridcell using exploroute188089
tabulate _merge
drop _merge
replace exploroute188089 = 0 if exploroute188089 == .
label variable exploroute188089 "Dummy if explorer route in 1880-1899 in the cell"
save ghana_missions1, replace

* Distance
* Coordinates of the cells crossed by an explorer route (1932 cross-section)
use ghana_missions1, clear
keep if year == 1932 & exploroute188089 == 1
keep gridcell longitude latitude
rename gridcell exploroutecell
rename longitude exploroute_lon
rename latitude exploroute_lat
save exploroute_points_temp, replace

* Form all (cell, route cell) pairs and keep the shortest distance per cell
use ghana_missions1, clear
keep if year == 1932
keep gridcell longitude latitude
rename longitude cell_lon
rename latitude cell_lat
cross using exploroute_points_temp
geodist cell_lat cell_lon exploroute_lat exploroute_lon, gen(dist)
collapse (min) dist, by(gridcell)
rename dist exploroute188089_dist
save exploroutedist_temp, replace

* Attach the cell-level distance to the panel
use ghana_missions1, clear
sort gridcell
merge gridcell using exploroutedist_temp
tabulate _merge
drop _merge
label variable exploroute188089_dist "Euclidean distance to an explorer route in 1880-1899"
save ghana_missions1, replace
count
* 380,562

*** Excluding Reverend Ramseyer ***

* Simple dummies *
* Same journeys file, without the rows whose explorer is Ramseyer
clear
import delimited journeys_cells.csv
drop if explo_name == "Ramseyer"
keep gridcell
generate exploroute188089m = 1
sort gridcell
save exploroute188089m, replace

* Attach to the panel; cells absent from the file get 0
use ghana_missions1, clear
sort gridcell
merge gridcell using exploroute188089m
tabulate _merge
drop _merge
replace exploroute188089m = 0 if exploroute188089m == .
label variable exploroute188089m "Dummy if military explorer route in 1880-1899 in the cell"
save ghana_missions1, replace

* Distance
* Coordinates of the cells crossed by one of these routes (1932 cross-section)
use ghana_missions1, clear
keep if year == 1932 & exploroute188089m == 1
keep gridcell longitude latitude
rename gridcell exploroutecell
rename longitude exploroutem_lon
rename latitude exploroutem_lat
save exploroutem_points_temp, replace

* Form all (cell, route cell) pairs and keep the shortest distance per cell
use ghana_missions1, clear
keep if year == 1932
keep gridcell longitude latitude
rename longitude cell_lon
rename latitude cell_lat
cross using exploroutem_points_temp
geodist cell_lat cell_lon exploroutem_lat exploroutem_lon, gen(dist)
collapse (min) dist, by(gridcell)
rename dist exploroute188089m_dist
save exploroutedistm_temp, replace

* Attach the cell-level distance to the panel
use ghana_missions1, clear
sort gridcell
merge gridcell using exploroutedistm_temp
tabulate _merge
drop _merge
label variable exploroute188089m_dist "Euclidean distance to military explorer route in 1880-1899"
save ghana_missions1, replace
count
* 380,562

******************************************************
* ADDITIONAL EXPLANATORY VARIABLES CREATED USING GIS *
******************************************************

* Created in the folder "GIS_explanatories"
* A sorted working copy is written to the current folder and erased after the merge
use "GIS_explanatories\GIS_explanatories", clear
sort gridcell
save GIS_explanatories, replace

use ghana_missions1, clear
sort gridcell
merge gridcell using GIS_explanatories
tabulate _merge
drop _merge
* Note: this save uses saveold, unlike the other sections
saveold ghana_missions1, replace
count
* 380,562
capture erase "GIS_explanatories.dta"

**********************************
* WE CREATE ADDITIONAL VARIABLES *
**********************************

use ghana_missions1, clear
count
* 380,562

*** Additional controls ***

* Log of some variables (log of 1 + x) *
gen lupop_1931 = log(upop_1931+1)
gen lrpop_1931 = log(rpop_1931+1)
gen lcocoa_prod27 = log(cocoa_prod27+1)
gen lupop_1901 = log(upop_1901+1)
gen lrpop_1901 = log(rpop_1901+1) if rpop_1901 != .
gen lupop_1891 = log(upop_1891+1)
* Dummies for the top cities *
* The existing kumasi variable is replaced by a dummy defined on the cell code
drop kumasi
gen accra = (gridcell == "AG60")
gen kumasi = (gridcell == "S49")
gen sekondi = (gridcell == "R66")
gen top3 = (accra == 1 | kumasi == 1 | sekondi == 1)

* In the dummies below, the second condition excludes missing values: in Stata a missing
* value compares as larger than any number, so "x > 0" alone is true when x is missing.
* (The original tested "x != 0", which is implied by "x > 0" and excluded nothing.)

*** Mission schools variables ***
egen asstschools_num = rsum(s3meth s3presb s3cath s3oth)
sort gridcell year
gen asstschools_yn = (asstschools_num > 0 & !missing(asstschools_num))

*** Denominations variables ***
gen metho_yn = (cmeth > 0 & !missing(cmeth))
gen presb_yn = (cpresb > 0 & !missing(cpresb))
gen catho_yn = (ccath > 0 & !missing(ccath))
gen other_yn = (coth > 0 & !missing(coth))

*** Exits variables ***
egen missionsexit_num = rsum(exitmeth exitpresb exitcath exitoth)
sort gridcell year
gen missionsexit_yn = (missionsexit_num > 0 & !missing(missionsexit_num))

save ghana_missions1, replace

** Distance to the 1st mission of each denomination **

* Reference cell per denomination:
*   presb: Presbyterian 1828 in Christiansborg (Accra), cell AG60
*   metho: Methodist 1835 Elmina, cell V65
*   catho: Catholic 1880 Elmina, cell V65
* Every variable is referred to by its full name, so the block does not depend on
* variable-name abbreviation (the original used "first", "cell_lon" and "first_lon"
* as abbreviations of firstpresby, cell_long and first_long).
foreach denom in presb metho catho {
    if "`denom'" == "presb" {
        local churchvar cpresb
        local firstyear 1828
        local firstcell "AG60"
    }
    else if "`denom'" == "metho" {
        local churchvar cmeth
        local firstyear 1835
        local firstcell "V65"
    }
    else {
        local churchvar ccath
        local firstyear 1880
        local firstcell "V65"
    }

    * Diagnostic output only: years and cells where the mission count equals 1
    use ghana_missions1, clear
    keep if `churchvar' == 1
    sum year
    tab gridcell if year == `firstyear'

    * Coordinates of the reference cell
    use ghana_missions1, clear
    keep if year == 1932 & gridcell == "`firstcell'"
    ren longitude first_lon
    ren latitude first_lat
    keep first_lon first_lat
    save first_temp, replace

    * Distance from every cell to the reference cell
    use ghana_missions1, clear
    keep if year == 1932
    ren longitude cell_lon
    ren latitude cell_lat
    keep gridcell cell_lon cell_lat
    cross using first_temp
    geodist cell_lat cell_lon first_lat first_lon, gen(dist)
    collapse (min) dist, by(gridcell)
    ren dist dist2first`denom'
    sort gridcell
    save dist2first`denom', replace
}

* Attach the three cell-level distances to the panel
use ghana_missions1, clear
foreach denom in presb metho catho {
    sort gridcell
    merge gridcell using dist2first`denom'
    tab _m
    drop _m
}
save ghana_missions1, replace

*********************
* DISTANCE TO ABURI *
*********************

* Aburi coordinates, renamed so they cannot clash with the cell coordinates.
import excel "aburi.xlsx", sheet("Sheet1") firstrow clear
rename latitude aburi_lat
rename longitude aburi_lon
save aburi_points_temp, replace

* One row per cell (1932 slice) with the cell coordinates.
use ghana_missions1, clear
keep if year == 1932
rename latitude cell_lat
rename longitude cell_lon
keep gridcell cell_lon cell_lat

* Pair every cell with every Aburi point and keep the shortest distance.
cross using aburi_points_temp
geodist cell_lat cell_lon aburi_lat aburi_lon, gen(dist)
collapse (min) dist2aburi = dist, by(gridcell)
save dist2aburi, replace

* Attach the distance to the cell-year panel.
use ghana_missions1, clear
sort gridcell
merge gridcell using dist2aburi
tabulate _merge
drop _merge
label variable dist2aburi "Euclidean distance to Aburi"
save ghana_missions1, replace

********************************************************
* JEDWAB & MORADI (2015) WESTERN, PLACEBO AND IV LINES *
********************************************************

* We create more railroad variables.
* Start from one row per cell (the 1891 slice of ghana1) and retain only the
* distances to the western line, the placebo lines and the IV line.
use ghana1, clear
keep if year == 1891
drop year
keep gridcell dist2west18 dist2apamok dist2aok dist2ccpk dist2sok dist2kpong dist2iv
count
* 2,091 cells (includes the North)
* Pair each cell with every year to obtain a cell-year panel.
cross using years
count
* 2,091 cells x 182 years (1751-1932) = 380,562 observations
order gridcell year
label variable year "Year"
generate gridcellyear = gridcell + string(year)
sort gridcellyear
save west_plac_iv, replace
count
* 380,562

* Attach the distances to the main panel on the cell-year key.
use ghana_missions1, clear
sort gridcellyear
merge gridcellyear using west_plac_iv
tabulate _merge
drop _merge

* Distance-band dummies, built in turn for the western line, the placebo
* lines and the IV line. For each distance variable the dummies are created
* in this order: _10, _20, _30, _40 (10-km bands), then _030 and _040
* (cumulative 0-30 and 0-40 km).
foreach v in dist2west18 dist2placebo dist2iv {
    if "`v'" == "dist2placebo" {
        * The placebo distance is the distance to the nearest of the five
        * placebo lines.
        codebook dist2apamok dist2aok dist2ccpk dist2sok dist2kpong
        egen dist2placebo = rmin(dist2apamok dist2aok dist2ccpk dist2sok dist2kpong)
    }
    else {
        codebook `v'
    }
    generate `v'_10 = (`v' <= 10)
    forvalues top = 20(10)40 {
        local bottom = `top' - 10
        generate `v'_`top' = (`v' > `bottom' & `v' <= `top')
    }
    foreach top in 30 40 {
        generate `v'_0`top' = (`v' <= `top')
    }
}
save ghana_missions1, replace

*** EASTERN LINE ONLY ***

* One row per cell from the 1891 slice of ghana1.
use ghana1, clear
keep if year == 1891
drop year
* Distance to the eastern line as of 1931: the minimum over the 1918 eastern
* line and the two later line variables.
egen dist2east31 = rmin(dist2east18 dist2line1823 dist2line2331)
keep gridcell dist2east18 dist2east31
count
* 2,091 cells (includes the North)
* Pair each cell with every year to obtain a cell-year panel.
cross using years
count
* 2,091 cells x 182 years (1751-1932) = 380,562 observations
order gridcell year
label variable year "Year"
generate gridcellyear = gridcell + string(year)
sort gridcellyear
save east, replace
count
* 380,562

* Attach the distances to the main panel on the cell-year key.
use ghana_missions1, clear
sort gridcellyear
merge gridcellyear using east
tabulate _merge
drop _merge

* Dummies for the Eastern line *
* Created per variable in this order: _10, _20, _30, _40 (10-km bands), then
* _030 and _040 (cumulative 0-30 and 0-40 km).
codebook dist2east18 dist2east31
foreach v in dist2east18 dist2east31 {
    generate `v'_10 = (`v' <= 10)
    forvalues top = 20(10)40 {
        local bottom = `top' - 10
        generate `v'_`top' = (`v' > `bottom' & `v' <= `top')
    }
    foreach top in 30 40 {
        generate `v'_0`top' = (`v' <= `top')
    }
}
save ghana_missions1, replace

**************************
* ETHNIC AND DISTRICT FE *
**************************

*** Ethnic FE ***

* The ethnic variable is read from the year-2000 slice of ghana1 (one row per
* cell) and merged into every year of the panel by gridcell.
use ghana1, clear
keep if year == 2000
keep gridcell ethnic
codebook ethnic
sort gridcell
save ethnic, replace

use ghana_missions1, clear
sort gridcell
merge gridcell using ethnic
tab _m
drop _m
save ghana_missions1, replace

*** District FE ***

* The CSV can hold several rows per cell. Rows are ordered by descending area
* within cell and the first row per cell is kept.
* NOTE(review): with ties in area the retained row depends on the sort
* order; confirm there are no ties if exact reproducibility matters.
import delimited Grid_0.1x0.1_Ghana_2091_census1931_v2.csv, clear 
gsort gridcell -area
bysort gridcell: keep if _n == 1
count
sort gridcell 
save district31, replace

use ghana_missions1, clear
sort gridcell
merge gridcell using district31
tab _m
* Lists the cells of the main data set that have no row in district31.
tab gridcell if _m == 1
* Districts are set by hand for two cells (presumably the unmatched ones
* listed above; verify against the tabulation).
replace district31 = "Kusasi" if gridcell == "AE4"
replace district31 = "Keta-Ada" if gridcell == "AT52"
codebook district31
drop _m
* 38
save ghana_missions1, replace

******************************
* DISTANCE TO RAILROAD NODES *
******************************

* Coordinates of the three railroad nodes, taken from the 1932 slice.
* Accra: AG60; Sekond: R67; Kumasi: S49
* NOTE(review): the sekondi dummy created earlier in this file uses cell R66,
* whereas this section uses R67 for Sekondi. Confirm which cell is intended.
use ghana_missions1, clear
keep if year == 1932 & inlist(gridcell, "AG60", "R67", "S49")
keep gridcell longitude latitude
rename gridcell nodecell
rename latitude node_lat
rename longitude node_lon
save node_points_temp, replace

* One row per cell, paired with every node; keep the nearest node distance.
use ghana_missions1, clear
keep if year == 1932
keep gridcell longitude latitude
rename latitude cell_lat
rename longitude cell_lon
cross using node_points_temp
geodist cell_lat cell_lon node_lat node_lon, gen(dist)
collapse (min) node_dist = dist, by(gridcell)
save nodedist_temp, replace

* Attach to the panel. A zero distance (the node cells themselves) is set to
* 5 before taking logs.
use ghana_missions1, clear
sort gridcell
merge gridcell using nodedist_temp
tabulate _merge
drop _merge
label variable node_dist "Euclidean distance to a railroad node"
replace node_dist = 5 if node_dist == 0
generate lnode_dist = ln(node_dist)
save ghana_missions1, replace
count
 
**********************************************
* SOIL SUITABILITY FROM JEDWAB & MORADI 2016 *
**********************************************

* Soils for Africa
* Keep the rows flagged as Ghana plus one extra point selected by its exact
* coordinates.
* NOTE(review): the equality test on longitude/latitude only matches if the
* stored values equal -1.05 and 11.05 exactly at their storage precision;
* confirm the extra point is actually retained.
use "soils_africa.dta", clear
keep if country == "Ghana" | (longitude == -1.05 & latitude == 11.05)
sort longitude latitude
save soils_ghana, replace
count

* The soil data are merged on the cell coordinates (not on gridcell).
use ghana_missions1, clear
sort longitude latitude
merge longitude latitude using soils_ghana
tab _m
* One cell is outside the GIS map, Y5
* We use the information from the closest cell, X5
drop _m
* For every soil variable: isolate the X5 value, spread it to all rows with
* egen max, copy it into the Y5 rows, and drop the two helper variables.
foreach X of varlist class1-sparseveg {
gen `X'X5 = `X' if gridcell == "X5"
egen `X'_X5 = max(`X'X5)
replace `X' = `X'_X5 if gridcell == "Y5"
drop `X'X5 `X'_X5
}
codebook class1-sparseveg
* ok
save ghana_missions1, replace

************************
* SOIL FERTILITY INDEX *
************************

* Soil fertility index by cell, read from the soil5_gh sheet.
import excel "soil5_gh.xlsx", sheet("soil5_gh") firstrow clear
sort gridcell
save soilfertiindex_ghana, replace
count

* Attach the index to every year of the panel.
use ghana_missions1, clear
sort gridcell
merge gridcell using soilfertiindex_ghana
tabulate _merge
drop _merge
save ghana_missions1, replace

**************************************
* RAIN FROM JEDWAB & STOREYGARD 2021 *
**************************************

* Source: Rémi Jedwab, Adam Storeygard, The Average and Heterogeneous Effects of Transportation Investments: Evidence from Sub-Saharan Africa 1960–2010, Journal of the European Economic Association, 2021, jvab027, https://doi.org/10.1093/jeea/jvab027
* The rain file is keyed on coordinates. They are renamed and rounded to the
* nearest multiple of 0.05 to serve as the merge key.
use "rain_adam.dta", clear
ren longitude longitude_adam
ren latitude latitude_adam
replace longitude_adam = round(longitude_adam,0.05)
replace latitude_adam = round(latitude_adam,0.05)
sort longitude_adam latitude_adam
save rain_adam2, replace

* The main data set keeps its own coordinates; copies are used as merge keys.
* NOTE(review): the master-side keys are not rounded, so the match relies on
* longitude/latitude already being stored identically to the rounded values;
* check the _merge tabulation.
use ghana_missions1, clear
gen longitude_adam = longitude
gen latitude_adam = latitude
sort longitude_adam latitude_adam
merge longitude_adam latitude_adam using rain_adam2
tab _m
* Rows found only in the rain file are discarded.
drop if _m == 2
drop _m
* Comparison of the two precipitation measures (output only).
corr av_yr_pre prec_mean
codebook av_yr_pre prec_mean
save ghana_missions1, replace

*****************
* SLAVE EXPORTS *
*****************

* The slave-export file is keyed on the Murdock group name; rename the key so
* it matches the ethnic variable of the main data set.
use grid_slaveexports, clear
rename murdock_name ethnic
sort ethnic
count
* 34
save grid_slaveexports2, replace

* Attach to the panel by ethnic group.
use ghana_missions1, clear
sort ethnic
merge ethnic using grid_slaveexports2
tabulate _merge
drop _merge
save ghana_missions1, replace
describe

********
* HYDE *
********

* Keep the 1700 and 1800 variables of the gridded population file.
* The two correlation tables compare the mpop* and spop* series (output only).
use "grid_pop.dta", clear
corr mpop*
corr spop*
keep gridcell *1800 *1700
sort gridcell
save hyde, replace

* "clear" is the documented option of -use- for discarding the data in
* memory ("replace" is an option of -save-, not of -use-).
use ghana_missions1, clear
sort gridcell
merge gridcell using hyde
tab _m
drop _m
corr spop1800 mpop1800
corr spop1700 mpop1700
gen density1800s = spop1800/area_sqkm
sum density1800s, d
* sum of population per sq km in 1800
gen density1800m = mpop1800/area_sqkm
sum density1800m, d
* sum of population (mean) per sq km in 1800
save ghana_missions1, replace

*********************
* CITIES 1400, 1800 *
*********************

* Data from Chandler:
* In 1400 no towns in Ghana.
* In 1800 there are three towns:
* Kumasi S49, Sanga (Tamale AA21), Yendi (AI21)

* Coordinates of the three town cells, taken from the 1932 slice.
* "clear" is the documented option of -use- for discarding the data in
* memory ("replace" is an option of -save-, not of -use-).
use ghana_missions1, clear
keep if year == 1932
keep if gridcell == "AA21" | gridcell == "S49" | gridcell == "AI21"
ren longitude city1800_lon
ren latitude city1800_lat
keep city1800_*
save city1800, replace

* One row per cell, paired with every town; keep the distance to the nearest.
use ghana_missions1, clear
keep if year == 1932
keep gridcell longitude latitude
ren longitude cell_lon
ren latitude cell_lat
cross using city1800
geodist cell_lat cell_lon city1800_lat city1800_lon, gen(dist)
collapse (min) dist, by(gridcell)
ren dist dist2city1800
save dist2city1800, replace

* Attach the distance and add a dummy for the town cells themselves.
use ghana_missions1, clear
sort gridcell
merge gridcell using dist2city1800
tab _m
drop _m
gen city1800 = (gridcell == "AA21" | gridcell == "S49" | gridcell == "AI21")
save ghana_missions1, replace
 
************************
* ADDITIONAL VARIABLES *
************************

* National2: Accra / Cape Coast / Kumasi + neighboring cells
* The CSV lists the cells concerned; every listed cell gets national2 = 1 and
* all other cells are set to 0 after the merge.
import delimited "national2.csv", varnames(1) clear 
gen national2 = 1
sort gridcell
save national2, replace

use ghana_missions1, clear
sort gridcell 
merge gridcell using national2
tab _m
drop _m
replace national2 = 0 if national2 == .

* Log number of missions (+1)
gen lmissions_num = log(missions_num+1)
gen lasstschools_num = log(asstschools_num+1)
gen lmainstat_num = log(mainstat_num+1)

* Variables needed for the long-difference regressions 
* For each benchmark year and each outcome, the value observed in that year
* is copied to all years of the same cell (helper variable, then the
* within-cell maximum, then the helper is dropped).
foreach X in 1850 1875 1897 1900 1924 {
foreach Y in missions_yn missions_num lmissions_num asstschools_yn asstschools_num lasstschools_num mainstat_yn mainstat_num {
gen `Y'`X'var = `Y' if year == `X'
bysort gridcell: egen `Y'`X' = max(`Y'`X'var)
drop `Y'`X'var
}
}

* National capitals
* Accra: AG60; Cape Coast (capital until 1877): W65; Kumasi: S49
gen national = (gridcell == "AG60" | gridcell == "W65" | gridcell == "S49")

* Population variables
* 1901 Census 
* Both variables are set to 0 where map08_yn is 0.
replace headchief08 = 0 if map08_yn == 0
replace lrpop_1901 = 0 if map08_yn == 0
* 2000 Urbanization
* A missing rate (missing or zero total population) is set to 0.
gen urbrate2000 = upop_2000/pop_2000*100
replace urbrate2000 = 0 if urbrate2000 == .

* Variables with percentage share that should be in 100
foreach X in histo_malaria palmoilbelt gold kola {
replace `X' = `X'*100
sum `X'
}

* One category only (class 1 + 2 + 3)
egen dist2class123_1930 = rmin(dist2class1_1930 dist2class2_1930 dist2class3_1930)

* Railroads 1903 and 1932
* Same spreading technique as above; the helper variables dist2rail03 and
* dist2rail32 are left in the data set.
foreach X in 03 32 {
gen dist2rail`X' = dist2rail if year == 19`X'
bysort gridcell: egen dist2rail19`X' = max(dist2rail`X')
}

* In GCC or not
* Same boundaries in 1831 as in 1873
gen nongcc1873 = (gcc1831 == 0)

* Dummies and distances that are logged (and replace 5 km if < 5 km):
* The floor is applied in place, so the original variable is modified. The
* 10-km dummy is computed after the floor and is 0 for missing distances.
foreach X in dist2coast dist2riv_navigable slave_route_perbi_dist slave_route_anq_dist traderoute18501890_dist exploroute188089_dist dist2rail1932 dist2class3_1901 dist2class3_1930 dist2class2_1930 dist2class1_1930 dist2class123_1930 {
replace `X' = 5 if `X' < 5 & `X' != .
gen l`X' = log(`X')
gen `X'10 = (`X' <= 10)
}

* Non-distance variables that are logged (1 if 0).
* Values below 1 are raised to 1 in place before taking logs.
sum alt_mean alt_sd av_yr_prec
foreach X in alt_mean alt_sd av_yr_prec {
replace `X' = 1 if `X' < 1 & `X' != .
gen l`X' = log(`X')
}

* Mines 1903 and 1932
* Year-specific value spread to all years of the cell; helpers are kept.
foreach X in 03 32 {
foreach Y in mine_yn minet_dist {
gen `Y'`X' = `Y' if year == 19`X'
bysort gridcell: egen `Y'19`X' = max(`Y'`X')
} 
}

* Gold Mines 1903 and 1932
foreach X in 03 32 {
foreach Y in gold_yn goldminet_dist {
gen `Y'`X' = `Y' if year == 19`X'
bysort gridcell: egen `Y'19`X' = max(`Y'`X')
} 
}

* Cocoa Production (Tons) in 1901 and 1927
* 1901 production is zero everywhere except 9 in cell AH57.
gen cocoa_prod01 = 0
replace cocoa_prod01 = 9 if gridcell == "AH57"
gen lcocoa_prod01 = log(cocoa_prod01+1)
* NOTE(review): a missing cocoa_prod27 would also be coded 1 here, because
* missing compares as larger than 0; confirm the variable has no missings.
gen cocoa = (cocoa_prod27 > 0)

* Kola, palm oil belt, gold = share of grid
* Threshold dummies: share strictly above 0, 1, 10 and 50.
foreach X in 0 1 10 50 {
gen kolanut`X' = (kolanut > `X')
gen palmoilbelt`X' = (palmoilbelt > `X')
gen gold`X' = (gold > `X')
}

* Rubber (plantation), palm oil (plantation), slave market = distance
* Proximity dummies within 10, 25, 50 and 100 (same unit as the distance
* variables), also for mines and gold mines in 1903 and 1932.
foreach X in 10 25 50 100 {
gen rubber1900_dist`X' = (rubber1900_dist <= `X')
gen palmoil1900_dist`X' = (palmoil1900_dist <= `X')
gen slavemarket_strict_dist`X' = (slavemarket_strict_dist <= `X')
gen minet_dist1903_`X' = (minet_dist1903 <= `X')
gen minet_dist1932_`X' = (minet_dist1932 <= `X')
gen goldminet_dist1903_`X' = (goldminet_dist1903 <= `X')
gen goldminet_dist1932_`X' = (goldminet_dist1932 <= `X')
}

* Southern Dummy (lower than median latitude)
gen south = (latitude <= 7.85)

save ghana_missions1, replace

**********************************************
*** VARIABLES FOR ANALYSIS ON MISSIONARIES ***
**********************************************

use ghana_missions1, clear
sort gridcell year
* Methodist + Presbyterian missions
* The "< ." bound excludes missing counts, which Stata would otherwise treat
* as larger than 1. This matches the guards used for the eur_* and afr_*
* variables below.
gen methmissions_yn = (cmeth >= 1 & cmeth < .)
gen presbmissions_yn = (cpresb >= 1 & cpresb < .)
label var methmissions_yn "Methodist mission"
label var presbmissions_yn "Presbyterian mission"
* Protestant mission dummy and number
gen promissions_yn = (methmissions_yn == 1 | presbmissions_yn == 1)
gen promissions_num = (cmeth+cpresb)
label var promissions_yn "Protestant mission"
label var promissions_num "Number of Protestant missions"
* European vs. African missionaries
gen euromissions_yn = ((eur_meth >= 1 & eur_meth != .) | (eur_presb >= 1 & eur_presb != .))
gen afrmissions_yn = ((afr_meth >= 1 & afr_meth != .) | (afr_presb >= 1 & afr_presb != .))
gen euroafrmissions_yn = (euromissions_yn == 1 | afrmissions_yn == 1)
label var euromissions_yn "Euro. missionary"
label var afrmissions_yn "Afri. missionary"
label var euroafrmissions_yn "Euro. or Afri. missionary"
* We assume there is an African missionary if mission but no European missionary.
* (As coded: a Protestant mission with neither a European nor an African
* missionary recorded is counted as having an African missionary.)
gen afrmissions2_yn = afrmissions_yn
replace afrmissions2_yn = 1 if promissions_yn == 1 & euroafrmissions_yn == 0
gen euroafrmissions2_yn = (euromissions_yn == 1 | afrmissions2_yn == 1)
label var afrmissions2_yn "Afri. missionary (incl. if we don't know if Euro.)"
label var euroafrmissions2_yn "Euro. or Afri. missionary"
* We add information on whether the missionary was European.
* euro_panel is merged on cell and year and supplies euroyr and euromainyr.
sort gridcell year
merge gridcell year using euro_panel
tab _m
drop _m
* Each derived dummy is set to missing where its source variable is missing.
gen eurononmainyr = (promissions_yn == 1 & euroyr == 1 & euromainyr == 0)
replace eurononmainyr = . if euromainyr == .
gen afriyr = (promissions_yn == 1 & euroyr == 0)
replace afriyr = . if euroyr == .
gen afrimainyr = (promissions_yn == 1 & euromainyr == 0)
replace afrimainyr = . if euromainyr == .
label var eurononmainyr "Non-main Euro. station in t"
label var euromainyr "Main Euro. station in t"
label var afriyr "Afri. station in t"
label var afrimainyr "Afri. station in t (any station not main Euro.)"
* Numeric cell identifier.
egen gridnum = group(gridcell)
label var gridnum "Grid number"
save ghana_missions1, replace

********************
*** NIGHT LIGHTS ***
********************

*** NIGHT LIGHTS 1996-2011 (Radiance Calibrated) ***
* See the folder "Night Lights" for details.
use "Night Lights\nlights_wotopcoding_byGhanagrid", clear
keep gridcell lli*mean
* The source variables carry a two-digit year; rename them to four-digit
* years. The 1011 variable becomes the 2011 series.
foreach yr in 1996 1999 2000 2003 2004 2006 2010 {
    local yy = substr("`yr'", 3, 2)
    rename lli`yy'_mean nltc`yr'
}
rename lli1011_mean nltc2011
summarize nltc*
* We keep 2010
keep gridcell nltc2010
sort gridcell
save nl_ghana_corrtc, replace
summarize nltc*

* We add to the main data set.
use ghana_missions1, clear
sort gridcell
merge gridcell using nl_ghana_corrtc
tabulate _merge
drop _merge
generate lnltc2010 = ln(nltc2010+1)
label variable nltc2010 "NTL in 2010 (radiance calibrated)"
label variable lnltc2010 "Log NTL in 2010 (radiance calibrated)"
save ghana_missions1, replace

****************************
*** ADDITIONAL VARIABLES ***
****************************

* Created in the folder "Additional Variables for Ghana"
* Cell-level extras are staged in temp.dta and merged into the panel.
use "Additional Variables for Ghana\ghana_grid_MissingVariables", clear
keep gridcell year_anthro gridvalue1900 gridvalue1924 polity polygamy missing_murdock dist2muslimce nevercolonised date_col dist2explorer rurc1900 urbc1900 slavery
sort gridcell
save temp, replace

use ghana_missions1, clear
sort gridcell
merge gridcell using temp
tabulate _merge
drop _merge

* Survey-year dummies: any non-zero year, then surveyed from 1830 up to (but
* excluding) each benchmark year.
generate yrstudy_yn = (year_anthro != 0)
label variable yrstudy_yn "Dummy if know year of anthropologic survey in Murdock"
foreach cutoff in 1900 1924 {
    generate yrstudy`cutoff' = (year_anthro >= 1830 & year_anthro < `cutoff')
}
foreach cutoff in 1900 1924 {
    label variable yrstudy`cutoff' "Dummy if year of anthropological survey in Murdock < `cutoff'"
}
label variable missing_murdock "Ethnicity not included in Murdock"

* Distance to a Muslim centre: floor at 5, 10-km dummy, log.
replace dist2muslimcentre = 5 if dist2muslimcentre <= 5
generate dist2muslimcentre10 = (dist2muslimcentre <= 10)
generate ldist2muslimcentre = ln(dist2muslimcentre)

* Equals 1 when polity is 2, 3 or 4.
generate centra = inlist(polity, 2, 3, 4)

* Distance to an explorer route: same treatment as above.
replace dist2explorer = 5 if dist2explorer <= 5
generate dist2explorer10 = (dist2explorer <= 10)
generate ldist2explorer = ln(dist2explorer)

* 1900 urban and rural counts: zeros are raised to 1 before taking logs.
replace urbc1900 = 1 if urbc1900 == 0
replace rurc1900 = 1 if rurc1900 == 0
foreach p in urbc rurc {
    generate l`p'1900 = ln(`p'1900)
}

* Export values: zero or missing is set to 1, so the log is 0.
foreach yr in 1900 1924 {
    replace gridvalue`yr' = 1 if gridvalue`yr' == . | gridvalue`yr' == 0
}
foreach yr in 1900 1924 {
    generate lgridvalue`yr' = ln(gridvalue`yr')
}
label variable lgridvalue1900 "Log 1900 Export values from all cash crops assigned to cell"
label variable lgridvalue1924 "Log 1924 Export values from all cash crops assigned to cell"
save ghana_missions1, replace

* Beach 1903 data (Protestants only)
* The count is renamed and a presence dummy is derived from it.
use "Beach1903_grid.dta", clear
rename beach1900 beach03_pro_num
generate beach03_pro_yn = (beach03_pro_num != . & beach03_pro_num > 0)
label variable beach03_pro_yn "Dummy if Protestant Missions in 1903 (Beach Map)"
sort gridcell
save beach03_v2, replace

* Roome 1924 data (Protestants and Catholics)
* We do not include bfbs
use "roome24_grid.dta", clear
* Presence dummies by denomination, then for either denomination.
generate roome24_pro_yn = (roome24_protestant != . & roome24_protestant > 0)
generate roome24_cat_yn = (roome24_catholics != . & roome24_catholics > 0)
generate roome24_mis_yn = (roome24_cat_yn == 1 | roome24_pro_yn == 1)
* Total number of missions; a missing component counts as zero.
egen roome24_mis_num = rsum(roome24_protestant roome24_catholics)
rename roome24_protestant roome24_pro_num
rename roome24_catholics roome24_cat_num
drop roome24_bfbs
order gridcell roome24_mis_yn roome24_mis_num roome24_pro_yn roome24_pro_num roome24_cat_yn roome24_cat_num
label variable roome24_mis_yn "Dummy if Missions (Protestants & Catholics) in 1924 (Roome Map)"
label variable roome24_mis_num "Number of Missions (Protestants & Catholics) in 1924 (Roome Map)"
label variable roome24_pro_yn "Dummy if Protestant Missions in 1924 (Roome Map)"
label variable roome24_cat_yn "Dummy if Catholic Missions in 1924 (Roome Map)"
sort gridcell
save roome24_v2, replace

* Both map-based files are merged into the panel, Roome first.
use ghana_missions1, clear
foreach src in roome24_v2 beach03_v2 {
    sort gridcell
    merge gridcell using `src'
    tabulate _merge
    drop _merge
}
save ghana_missions1, replace

*****************************
*** DISTANCE TO A MISSION ***
*****************************

* Distance to missions 1900 and 1924 *
* For each benchmark year: save the coordinates of the cells with a mission,
* pair every cell with every mission cell, and keep the distance to the
* nearest one.
foreach X in 1900 1924 {
* Mission cells in the benchmark year.
use ghana_missions1, clear
keep if year == `X'
keep gridcell missions_yn longitude latitude
keep if missions_yn
sort gridcell
save mis`X', replace
* All cells in the benchmark year. This must read ghana_missions1:
* ghana_missions2 is only written later in this do-file, so on a fresh run
* it does not exist yet, and on a re-run it would be a stale copy.
use ghana_missions1, clear
keep if year == `X'
keep gridcell longitude latitude
ren longitude longitude2
ren latitude latitude2
* After the cross, gridcell and the *2 coordinates belong to the cell of
* interest; longitude and latitude belong to the mission cell.
cross using mis`X'
geodist latitude2 longitude2 latitude longitude, gen(dist)
collapse (min) dist, by(gridcell)
keep gridcell dist
ren dist dist2mis`X'
sort gridcell
save dist2mis`X', replace
}

* Attach both distances to the panel.
use ghana_missions1, clear
sort gridcell
merge gridcell using dist2mis1900
tab _m
drop _m
sort gridcell
merge gridcell using dist2mis1924
tab _m
drop _m
save ghana_missions1, replace

**********************
*** OTHER OUTCOMES ***
**********************

*** URBAN POPULATION ***

* From Jedwab & Moradi 2016.
* The file is re-saved sorted by gridcell so it can be merged.
use upop5000_2000, clear
sort gridcell
save upop5000_2000, replace

* We add to the data set.
use ghana_missions1, clear
sort gridcell
merge gridcell using upop5000_2000
tabulate _merge
drop _merge
* Urban population in 2000
* Missing values are set to zero before the log(x + 1) transform.
replace upop5000_2000 = 0 if upop5000_2000 == .
generate lupop5000_2000 = ln(upop5000_2000 + 1)
* Urban share in 2000
* A missing share (missing or zero total population) is set to zero.
generate urbsh5000_2000 = upop5000_2000/pop_2000*100
replace urbsh5000_2000 = 0 if urbsh5000_2000 == .
save ghana_missions1, replace

*** EMPLOYMENT, RELIGION, EDUCATION, DEMOGRAPHICS ***

* From Jedwab & Moradi 2016
* See the folder "Data from the 2000 Census"
* The selected outcomes are staged in temp.dta, sorted by cell.
use census2000_gridlevel_outcomes, clear
keep gridcell manuf_sh fire_sh cogn_br_sh rel_chri yrseduc educ_prim_comp compfert3549 compnetfert3549
sort gridcell
save temp, replace

* Merged with the update option, so values from temp.dta fill in variables
* that already exist in the main data set but are missing there.
use ghana_missions1, clear
sort gridcell
merge gridcell using temp, update
tabulate _merge
drop _merge
* Difference between completed and net completed fertility.
generate compmort3549 = compfert3549 - compnetfert3549
save ghana_missions1, replace

*** ANTHROPOMETRIC OUTCOMES ***

* From the DHS (in the folder "DHS")
* We keep the length-for-age and weight-for-age Z-scores (when there are at
* least 5 obs. per cell)
use anthro_dhs, clear
keep gridcell zlen_5obscell zwei_5obscell
sort gridcell
save anthro_dhs2, replace

* Attach the two Z-scores to the panel.
use ghana_missions1, clear
sort gridcell
merge gridcell using anthro_dhs2
tabulate _merge
drop _merge
save ghana_missions1, replace

****************************************************************************
*** LABELLING SOME VARIABLES AND FINALIZING THE CROSS-SECTIONAL DATA SET ***
****************************************************************************

* Restrict the panel to the variables used in the analysis. All years are
* retained at this stage.
use ghana_missions1, clear
keep gridcell year gridnum *3549 *5obscell asstschools_num mainstat_num dist2east31 dist2east31_030 lupop5000_* missions_num yrseduc educ_prim_comp rel_chri cogn_br_sh induserv_sh manuf_sh fire_sh pop_2000 urbsh5000_* dist2mis* yrstudy* mainstat_yn* asstschools_yn* *beach* *roome* minet_dist *gridvalue* polygamy slavery *urbc* *rurc* *dist2explorer *dist2muslimce* centra date* never* missing_murdock tsi yr_startup *nltc2010 station_roome lmissions_num asstschools_yn mainstat_yn station_beach dist2rail south dist2placebo_030 dist2iv_030 lnode_dist node_dist dist2rail1932 dist2west18_030 missions_yn* euroyr area_sqkm longitude latitude region_4 province_2000 district_2000 district31 ethnic density1800m prec_mean alt_mean alt_sd soil5 histo_malaria port1850_yn dist2coast ldist2coast nongcc1873 dist2riv_navigable10 traderoute18501890_dist10 exploroute188089_dist10 dist2rail193210 dist2class123_193010 city1800 map08_yn national headchief lupop_1891 lupop_1901 lrpop_1901 lupop_1931 lrpop_1931 ln_export_area slavemarket_strict_dist50 palmoil1900_dist50 kolanut0 rubber1900_dist50 cocoa minet_dist1932_50 missions_yn cmeth cpresb coth ccath dist2firstpresb dist2firstmetho dist2firstcatho upop_2000 rpop_2000 catho_yn upop_1931 rpop_1931 methmissions_yn presbmissions_yn promissions_* euromissions_yn afrmissions_yn euroafrmissions_yn afrmissions2_yn euroafrmissions2_yn afrimainyr euromainyr eurononmainyr afriyr afrimainyr
label var area_sqkm "Area (sq km)"
label var gridcell "Grid cell ID"
label var year "Year"
label var longitude "Longitude"
label var latitude "Latitude"
label var region_4 "Region (as of 1931; N = 4)"
label var province_2000 "Province (as of 2000; N = 10)"
label var district_2000 "District (as of 2000; N = 109)"
label var district31 "District (as of 1931; N = 38)"
* Spelling of "Murdock" corrected in the label text.
label var ethnic "Main ethnic group in Murdock (N = 34)"
label var lupop_1891 "Log urban pop. (sum of pop. > 1K) 1891"
label var lupop_1901 "Log urban pop. (sum of pop. > 1K) 1901"
label var lupop_1931 "Log urban pop. (sum of pop. > 1K) 1931"
label var lrpop_1901 "Log rural pop. (sum of pop. < 1K) 1901"
label var lrpop_1931 "Log rural pop. (sum of pop. < 1K) 1931"
label var soil5 "Average soil fertility"
label var city1800 "Dummy Large pre-colonial city 1800"
label var national "Dummy largest and second largest cities 1901"
label var nongcc1873 "Dummy Gold Coast Colony established by the British c. 1850"
label var dist2coast "Distance to the coast"
label var ldist2coast "Log distance to the coast"
* The next labels refer to variables by abbreviation. After the keep above,
* each abbreviation must match exactly one retained variable.
label var dist2riv_nav "Dummy if navigable river 10 Km"
label var traderoute18 "Dummy if Ashanti trade route 1850 10 Km"
label var exploroute18 "Dummy if non-Ashanti trade route 1850 10 Km"
label var dist2rail193210 "Dummy if Railroad 1932 10 Km"
label var dist2cla "Dummy if the cell was within 10 km from a road in 1930"
label var cocoa "Dummy if cocoa-producing cell 1927"
label var kolanut0 "Dummy if kola-producing cell 1932"
label var rubber1900_d "Dummy within 50 km from a rubber plantation in 1900-1936"
label var palmoil1900_ "Dummy if palm oil plantation 1900-1936 50 Km"
label var slavemarket_ "Dummy slave market 1800 50Km"
label var minet_dist19 "Dummy central location of a mine in 1932"
label var missions_yn "Dummy if mission in the cell in year t"
label var density1800m "Population density (est.) c. 1800"
label var dist2firstpresb "Distance to 1st Presbyterian mission"
label var dist2firstmetho "Distance to 1st Methodist mission"
label var dist2firstcatho "Distance to 1st Catholic mission"
label var catho_yn "Dummy if Catholic mission in the cell"
label var upop_2000 "Urban pop. (sum of pop. > 1K) 2000"
* rpop_2000 is the rural counterpart of upop_2000 (it was labelled "Urban").
label var rpop_2000 "Rural pop. (sum of pop. < 1K) 2000"
* Label each benchmark-year mission dummy with its own year. The loop used to
* relabel missions_yn1850 on every pass. 1854 is omitted because no
* missions_yn1854 variable is created by the long-difference loop.
foreach X in 1850 1875 1897 {
label var missions_yn`X' "Mission dummy in `X'"
}
* Last mission variables * 
* Presence dummies from the Beach and Roome station counts.
* NOTE(review): ">= 1" is also true for a missing count; confirm that
* station_beach and station_roome have no missing values.
gen station_beach_yn = (station_beach >= 1)
label var station_beach_yn "Mission in Beach Atlas"
gen station_roome_yn = (station_roome >= 1)
label var station_roome_yn "Mission in Roome Atlas"
* Cell-level European-station variables are merged in from euro_cross.
sort gridcell
merge gridcell using euro_cross
tab _m
drop _m
gen eurononmain4690 = (euro4690 == 1 & euromain4690 == 0)
replace eurononmain4690 = . if euro4690 == .
label var eurononmain4690 "European mission - not main station - 1846-1890"
* We create the early creation dummies
* period1: start-up year up to 1850; period2: 1851-1875. Both are missing
* when the start-up year is unknown, then forced to 0 in cell-years without
* a mission.
gen period1 = (yr_startup <= 1850)
replace period1 = . if yr_startup == .
gen period2 = (yr_startup > 1850 & yr_startup <= 1875)
replace period2 = . if yr_startup == .
replace period1 = 0 if missions_yn == 0
replace period2 = 0 if missions_yn == 0
label var yr_startup "First year with a mission"
label var period1 "Dummy if mission in first period"
label var period2 "Dummy if mission in second period"
* yearmis holds the current year in cell-years with a mission; firstyear is
* its minimum within the cell.
gen yearmis = year if missions_yn == 1
bysort gridcell: egen firstyear = min(yearmis)
label var firstyear "First year with a mission"
gen early1_yn = (firstyear != . & firstyear <= 1850)
gen early2_yn = (firstyear != . & firstyear >= 1851 & firstyear <= 1875)
label var early1_yn "Dummy if mission in first period"
label var early2_yn "Dummy if mission in second period"
* Distances that are zero or missing are set to 5 before taking logs.
replace dist2mis1900 = 5 if dist2mis1900 == . | dist2mis1900 == 0
gen ldist2mis1900 = log(dist2mis1900)
replace dist2mis1924= 5 if dist2mis1924 == . | dist2mis1924 == 0
gen ldist2mis1924 = log(dist2mis1924)
label var ldist2mis1924 "Log dist. to mission 1924"
label var dist2mis1924 "Dist. to mission 1924"
label var ldist2mis1900 "Log dist. to mission 1900"
label var dist2mis1900 "Dist. to mission 1900"
* Distance-band dummies around missions.
* NOTE(review): the bands combine the benchmark-year distance with
* missions_yn, which is the mission dummy of the current row year rather
* than of the benchmark year; confirm this is intended.
foreach X in 1900 1924 {
gen missions_yn`X'_0_10 = (missions_yn == 1 | (dist2mis`X' >= 0 & dist2mis`X' <= 10))
gen missions_yn`X'_10_20 = (missions_yn == 0 & dist2mis`X' > 10 & dist2mis`X' <= 20)
gen missions_yn`X'_20_30 = (missions_yn == 0 & dist2mis`X' > 20 & dist2mis`X' <= 30)
}
foreach X in 1900 1924 {
label var missions_yn`X'_0_10 "Mission within 10 km in `X'"
label var missions_yn`X'_10_20 "Mission within 10-20 km in `X'"
label var missions_yn`X'_20_30 "Mission within 20-30 km in `X'"
}
* Other variables
* Missing total population is set to zero before the log(x + 1) transform.
replace pop_2000 = 0 if pop_2000 == .
gen lpop_2000 = log(pop_2000+1)
label var lpop_2000 "Log (total population + 1) in 2000"
gen dist2rail30 = (dist2rail <= 30)
label var dist2rail30 "Dummy within 30 km from railroad"
gen city10k_1900 = (gridcell == "W65" | gridcell == "AG60" | gridcell == "AT53")
gen minet_dist50 = (minet_dist <= 50)
gen mfgfire_sh = manuf_sh+fire_sh
label var city10k_1900 "Dummy if city of at least 10,000 c. 1900"
label var minet_dist50 "Mine within 50 km"
label var mfgfire_sh "Employment share of manufacturing & services"
label var asstschools_yn "Dummy if mission school"
label var asstschools_num "Number of mission schools"
label var missions_num "Number of missions"
label var lmissions_num "Log number of missions"
label var dist2rail1932 "Distance to railroad 1932"
label var dist2muslimcentre "Dist. to a Muslim centre"
label var dist2muslimcentre10 "Dummy if within 10 km from a Muslim centre"
label var ldist2muslimcentre "Log dist. to a Muslim centre"
label var ldist2explorer "Log dist. to explorer route"
label var lurbc1900 "Log urban pop. 1900 (HYDE 3.0)"
label var lrurc1900 "Log rural pop. 1900 (HYDE 3.0)"
label var lupop5000_2000 "Log urban pop. (loc. >= 5K) in 2000"
label var urbsh5000_2000 "Urban sh. (loc. >= 5K) in 2000"
label var cogn_br_sh "Employment share of cognitive occupations in 2000"
label var compfert3549 "Completed fertility rate for 35-49 women in 2000"
label var compnetfert3549 "Net completed fertility rate for 35-49 women in 2000"
label var compmort3549 "Mortality rate of children for 35-49 women in 2000"
label var zwei_5obscell "Avg weight-for-age Z-score in the DHS"
label var zlen_5obscell "Avg length-for-age Z-score in the DHS"
* NOTE(review): yearmis holds the row year in cell-years with a mission (the
* within-cell minimum is firstyear), so this label may overstate it.
label var yearmis "First year with a mission"
label var centra "Centralization index in the Murdock atlas"
label var lnode_dist "Log Euclidean distance to a railroad node"
label var mainstat_yn "Dummy if main station"
* Replaces the missions_num label assigned a few lines above.
label var missions_num "Number of mission stations"
label var dist2iv_030 "Within 30 km from the EMST-based IV"
label var dist2west18_030 "Within 30 km from the Western line"
label var dist2placebo_030 "Within 30 km from a placebo railroad line"
label var dist2east31 "Distance to the Eastern line"
label var dist2east31_030 "Within 30 km from the Eastern line"
* Final labels of the benchmark-year dummies.
foreach X in 1850 1875 1897 1900 1924 {
label var missions_yn`X' "Dummy if mission in `X'"
label var asstschools_yn`X' "Dummy if mission school in `X'"
label var mainstat_yn`X' "Dummy if main mission in `X'"
}
label var mission4690 "Mission 1846-90"
label var euro4690 "Euro. mission 1846-90"
label var euromain4690 "Euro. mission - main Euro. station - 1846-90"
label var eurononmain4690 "Euro. mission - not main Euro. station - 1846-90"
label var pop_2000 "Total population in 2000"
label var induserv_sh "Empl. sh. of indu. and serv. (%) for adults aged 25 or over in 2000"
label var manuf_sh "Share of manuf in pop of indu workers (%) in 2000"
* NOTE(review): fire_sh receives the same label text as manuf_sh, which
* looks like a copy-paste slip; confirm the intended description of fire_sh.
label var fire_sh "Share of manuf in pop of indu workers (%) in 2000"
* Replaces the mfgfire_sh label assigned above (adds "in 2000").
label var mfgfire_sh "Employment share of manufacturing & services in 2000"
label var cogn_br_sh "Employment share of cognitive occupations in 2000"
label var yrseduc "Mean years of schooling (primary and above) in 2000"
label var educ_prim_comp "Share of 18+ pop. having completed primary (%) in 2000"
label var rel_chri "Share of Catholics + Protestants in total pop. (%) in 2000"
* We drop a few more variables
drop south euro*1852 mis*1852 euro*1877 mis*1877 euro*1889 mis*1889  
* We re-order them. 
* The varlist below uses ranges (a-b), so it depends on the order in which
* the variables were created upstream.
order gridcell gridnum year longitude latitude area_sqkm region_4 province_2000 district_2000 district31 ethnic missions_yn missions_yn1850-missions_yn1924 missions_yn1900_* missions_yn1924_* missions_num lmissions_num mainstat_yn asstschools_yn asstschools_num asstschools_yn1924 mainstat_yn1924 *dist2mis1900 *dist2mis1924 yearmis yr_startup firstyear promissions_yn promissions_num methmissions_yn presbmissions_yn cpresb-coth catho_yn dist2firstpresb dist2firstmetho dist2firstcatho euromissions_yn-euroafrmissions2_yn euromainyr-afrimainyr *4690 station_* beach03* roome24* period1 period2 early1* early2* dist2rail dist2rail30 dist2rail1932* dist2west18 dist2east* *placebo* dist2iv node_dist lnode_dist cocoa kolanut* rubber* palmoil* *gridvalue* *nltc2010 lupop5000_2000 urbsh5000_2000 upop_2000 rpop_2000 pop_2000 lpop_2000 induserv_sh manuf_sh fire_sh mfgfire_sh cogn_br_sh yrseduc educ_prim_comp rel_chri comp*3549 z*_5obscell alt_mean alt_sd *dist2coast histo_malaria tsi prec_mean soil5 *dist2riv* *traderoute* dist2class* *ln_export_area* port1850_yn nongcc1873 map08_yn headchief08 lupop_1891 l*pop_1901 *pop_1931 city1800 density1800m *rurc1900 *urbc1900 city10k_1900 national minet_* nevercolonised date_col centra slavery slavemarket* polygamy missing_murdock yrstudy* *explorer* *exploroute* *muslim* 
save ghana_missions2, replace

****************************************
*** CASH CROP VARIABLES AND DATA SET ***
****************************************

** GAEZ suitability **
* Read the GAEZ sheet, give its columns the names used in the rest of this file,
* and store it sorted on gridcell (the key on which it is merged afterwards).
import excel using "gh_gaez2091grids.xlsx", sheet("gh_cocoa") firstrow clear
rename (cocoa palmoil grid) (gaez_cocoa gaez_palm gridcell)
sort gridcell
save gaez_suit, replace

** Main database **
set matsize 10000
* Start from the cell-year panel and restrict it to 1846-1932.
use ghana_missions1, clear
keep if inrange(year, 1846, 1932)
* We add GAEZ suitability (cell-level file, matched on gridcell).
sort gridcell
merge gridcell using gaez_suit
tabulate _merge
drop _merge
* We add commodity exports (matched on year).
* NOTE(review): the old-style merge requires commodity_exports to be stored sorted by year - confirm where that file is created.
sort year
merge year using commodity_exports
tabulate _merge
drop _merge
* Cash crop value based on production dummies
* Number of producing cells for each crop (1932 cross-section).
* The counts reported after each tabulation are the denominators hard-coded below.
tabulate cocoa if year == 1932
* 289
tabulate rubber1900_dist50 if year == 1932
* 535
tabulate palmoil1900_dist50 if year == 1932
* 284
tabulate kolanut0 if year == 1932
* 175
* Total cash crop value: for each deflator, the sum over the four crops of
* (production dummy) x (crop value series) / (number of producing cells).
foreach defl in gh uk ghuk {
    generate cashcval_`defl' = cocoa*cocoa_v1_i_`defl'/289 ///
        + rubber1900_dist50*rubber_v1_i_`defl'/535 ///
        + palmoil1900_dist50*palmko_v1_i_`defl'/284 ///
        + kolanut0*kola_v1_i_`defl'/175
    summarize cashcval_`defl'
    generate lcashcval_`defl' = ln(cashcval_`defl'+1)
    label variable lcashcval_`defl' "Log cash crop value (deflator: `defl')"
}
* For each crop at a time: the level and the log of (1 + level).
* Cocoa
foreach defl in gh uk ghuk {
    generate cocoapval_`defl' = cocoa*cocoa_v1_i_`defl'/289
    summarize cocoapval_`defl'
    generate lcocoapval_`defl' = ln(cocoapval_`defl'+1)
}
* Palm oil
foreach defl in gh uk ghuk {
    generate palmpval_`defl' = palmoil1900_dist50*palmko_v1_i_`defl'/284
    generate lpalmpval_`defl' = ln(palmpval_`defl'+1)
}
* Rubber
foreach defl in gh uk ghuk {
    generate rubbpval_`defl' = rubber1900_dist50*rubber_v1_i_`defl'/535
    generate lrubbpval_`defl' = ln(rubbpval_`defl'+1)
}
* Kola
foreach defl in gh uk ghuk {
    generate kolapval_`defl' = kolanut0*kola_v1_i_`defl'/175
    generate lkolapval_`defl' = ln(kolapval_`defl'+1)
}
* Based on suitability *
* Same construction as for the production dummies, but the crop value series
* are allocated with suitability indicators instead. The counts reported after
* each tabulation are the denominators hard-coded below.
tabulate palmoilbelt0 if year == 1932, missing
* 155
tabulate forest if year == 1932, missing
* 554
tabulate suitable if year == 1932, missing
* 684
foreach defl in gh uk ghuk {
    generate cashcsval_`defl' = suitable*cocoa_v1_i_`defl'/684 ///
        + forest*rubber_v1_i_`defl'/554 ///
        + palmoilbelt0*palmko_v1_i_`defl'/155
    generate lcashcsval_`defl' = ln(cashcsval_`defl'+1)
    label variable cashcsval_`defl' "Crop value based on suitability (deflator: `defl')"
    label variable lcashcsval_`defl' "Log crop value based on suitability (deflator: `defl')"
}
* GAEZ suitability for cocoa and palm oil
* A cell counts as suitable when its GAEZ measure is strictly positive and non-missing.
generate gaez_cocoa_yn = (gaez_cocoa > 0 & gaez_cocoa != .)
generate gaez_palm_yn = (gaez_palm > 0 & gaez_palm != .)
tabulate gaez_palm_yn if year == 1932, missing
* 1092
tabulate forest if year == 1932, missing
* 554
tabulate gaez_cocoa_yn if year == 1932, missing
* 1170
foreach defl in gh uk ghuk {
    generate cashcs2val_`defl' = gaez_cocoa_yn*cocoa_v1_i_`defl'/1170 ///
        + forest*rubber_v1_i_`defl'/554 ///
        + gaez_palm_yn*palmko_v1_i_`defl'/1092
    generate lcashcs2val_`defl' = ln(cashcs2val_`defl'+1)
    label variable cashcs2val_`defl' "Crop value based on GAEZ suit. (deflator: `defl')"
    label variable lcashcs2val_`defl' "Log crop value based on GAEZ suit. (deflator: `defl')"
}
* Cocoa only (historical data) *
foreach defl in gh uk ghuk {
    generate cashcocsval_`defl' = suitable*cocoa_v1_i_`defl'/684
    generate lcashcocsval_`defl' = ln(cashcocsval_`defl'+1)
    label variable cashcocsval_`defl' "Crop value based on cocoa suit. (deflator: `defl')"
    label variable lcashcocsval_`defl' "Log crop value based on cocoa suit. (deflator: `defl')"
}
* Cocoa suitability based on different types of cocoa soils.
* Keep a copy of the original indicator, then make the three soil classes
* mutually exclusive: suitable only / highly suitable / very highly suitable.
generate suitablecocoa = suitable
tabulate suitable highsuit
replace suitable = 0 if highsuit == 1 | vhighsuit == 1
replace highsuit = 0 if vhighsuit == 1
tabulate suitable if year == 1932
* 228
tabulate highsuit if year == 1932
* 412
tabulate vhighsuit if year == 1932
* 44
* Yields for each = 223, 1338, 2000
* See text for details
* Yield attached to each cell, and the total over all cells in a given year.
generate sumyields = suitable*223 + highsuit*1338 + vhighsuit*2000
bysort year: egen sumsumyields = sum(sumyields)
summarize sumsumyields
* 690,100
* Each cell's share of that total (the total is hard-coded from the summary above).
generate share = sumyields/690100
* Cocoa based on best
foreach defl in gh uk ghuk {
    generate cashco2csval_`defl' = cocoa_v1_i_`defl'*share
    generate lcashco2csval_`defl' = ln(cashco2csval_`defl'+1)
    label variable cashco2csval_`defl' "Crop value based on cocoa suit. (deflator: `defl')"
    label variable lcashco2csval_`defl' "Log crop value based on cocoa suit. (deflator: `defl')"
}
* Additional variables for the regressions *
* Lag and lead of every log value series, computed within cell.
* `bysort gridcell (year)` sorts by gridcell AND year but groups by gridcell only,
* so [_n-1] / [_n+1] always refer to the previous / next observation in year order.
* A plain `bysort gridcell:` only guarantees the grouping, not the order inside each cell.
sort gridcell year
foreach Z in cashc palmp rubbp kolap cocoap cashcs cashcs2 cashcocs cashco2cs {
    foreach X in gh uk ghuk {
        foreach Y in 1 {
            * First observation of each cell has no lag (missing).
            bysort gridcell (year): gen lag`Y'l`Z'val_`X' = l`Z'val_`X'[_n-`Y']
            label var lag`Y'l`Z'val_`X' "Lag `Y' of l`Z'val_`X'"
            * Last observation of each cell has no lead (missing).
            bysort gridcell (year): gen lead`Y'l`Z'val_`X' = l`Z'val_`X'[_n+`Y']
            label var lead`Y'l`Z'val_`X' "Lead `Y' of l`Z'val_`X'"
        }
    }
}
* Additional mission variables *
* Denomination dummies: 1 if the corresponding variable is strictly positive.
* Stata treats missing values as larger than any number, so "x > 0" alone is true
* for a missing x; the explicit !missing() guard codes those observations as 0.
* mainline = positive cmeth or positive cpresb; catho = positive ccath; otprot = positive coth.
gen mainline = ((cmeth > 0 & !missing(cmeth)) | (cpresb > 0 & !missing(cpresb)))
gen catho = (ccath > 0 & !missing(ccath))
gen otprot = (coth > 0 & !missing(coth))
* We keep some variables only.
* Note: the wildcard *l*val_* keeps every log/lag/lead value series, and it also
* matches the level variables palmpval_* and kolapval_* for all three deflators
* (their names contain an "l" before "val_"), which is why those are labelled below.
keep gridcell gridnum year palmpval_uk rubbpval_uk kolapval_uk cocoapval_uk cashcval_uk cashcsval_uk missions_yn *l*val_* suitablecocoa longitude latitude ethnic district31 lmissions_num mainstat_yn asstschools_yn mainline otprot catho
* Lag of the mission dummy within cell; (year) fixes the within-cell order explicitly.
sort gridnum year
bysort gridnum (year): gen lag1missions_yn = missions_yn[_n-1]
label var missions_yn "Dummy if mission"
label var mainstat_yn "Dummy if main station"
label var asstschools_yn "Dummy if mission school"
label var district31 "District FE (1931)"
label var lmissions_num "Log number of missions"
label var mainline "Mainline protestant mission dummy"
label var catho "Catholic mission dummy"
label var otprot "Other protestant mission dummy"
label var lag1missions_yn "Lag of the mission dummy"
order gridcell gridnum year longitude latitude district31 ethnic missions_yn mainstat_yn asstschools_yn lmissions_num mainline catho otprot lag1missions_yn
label var cashcval_uk "Cash crop value (deflator: uk)"
drop suitablecocoa
* Crop-specific series, uk deflator: level and log both exist for all four crops.
foreach Z in cocoa palm rubb kola {
    foreach X in uk {
        label var `Z'pval_`X' "Crop value for `Z' (deflator: `X')"
        label var l`Z'pval_`X' "Log crop value for `Z' (deflator: `X')"
    }
}
* Level series that survived the keep for the gh and ghuk deflators (palm and kola only).
foreach Z in palm kola {
    foreach X in gh ghuk {
        label var `Z'pval_`X' "Crop value for `Z' (deflator: `X')"
    }
}
* Log series for the gh and ghuk deflators: these are logs, so they are labelled as such.
foreach Z in cocoa palm rubb kola {
    foreach X in ghuk gh {
        label var l`Z'pval_`X' "Log crop value for `Z' (deflator: `X')"
    }
}
sort gridnum year
save temp_cashc, replace

************************************************************
*** DATA SET FOR THE PANEL-EVENT STUDY FOR THE RAILROADS ***
************************************************************

* For this analysis, the exact timing of opening is particularly important.
* The main analysis relies on the timing provided by Jedwab & Moradi 2016 (JM16).
* Here we refine the variable capturing the timing of railroad opening, using the same sources as JM16.
* The main source is Luntinen, Pertti. 1996. Railway on the Gold Coast: A Meeting of Two Cultures: A Colonial History. Helsinki: Suomalainen Tiedeakatemia.
* Leaving the timing variable unmodified in the main analysis should, if anything, lead to more conservative estimates there (classical measurement error), so the main analysis is not changed.
use ghana_missions2, clear
* We create some railroad variables and only keep some variables to make the sample smaller.
* rail030 = 1 if the cell is within 30 km of a railroad (non-missing distance), 0 otherwise.
gen rail030 = (dist2rail <= 30 & dist2rail != .)
* Lags and leads (1 to 3 observations) of rail030 within cell.
* `bysort gridcell (year)` fixes the within-cell order by year, so [_n-k] / [_n+k]
* are taken in year order; `bysort gridcell:` alone would only guarantee the grouping.
sort gridcell year
foreach X of numlist 1(1)3 {
    bysort gridcell (year): gen lag`X'rail030 = rail030[_n-`X']
    bysort gridcell (year): gen lead`X'rail030 = rail030[_n+`X']
}
* String identifiers for ethnic-group-by-year and district-by-year cells.
gen ethnicyr = ethnic+string(year)
gen distyr = district31+string(year)
keep gridcell gridnum region_4 dist2placebo_030 dist2rail dist2west18_030 dist2east31_030 year missions_yn *rail030 longitude latitude ethnic district31 ethnicyr distyr
* We add the cell-level file "lines" (matched on gridcell).
* NOTE(review): presumably the source of dist2line18 / dist2line1823 used further down - confirm.
sort gridcell
merge gridcell using lines
tab _m
drop _m
* We create some variables for the event study
* In particular, we create "yearopen", the year each cell with a railroad was "opened"
* maxrail030 = 1 for cells that have rail030 == 1 in at least one year.
bysort gridnum: egen maxrail030 = max(rail030)
tab maxrail030 if year == 1932
* year_with_rail1 holds the year only in cell-years with a railroad within 30 km;
* its within-cell minimum is the first such year (missing for cells never reached).
gen year_with_rail1 = year if rail030 == 1
* The variable is referenced by its full name rather than through an abbreviation,
* so the line does not depend on `set varabbrev` or on other variable names.
bysort gridnum: egen yearopen = min(year_with_rail1)

* We create the variables for the Western line, then the Eastern line, then the Central line.
* Each line dummy is constant within a cell: it is the within-cell maximum of the
* underlying condition, i.e. 1 if the condition holds in at least one year.
* WESTERN LINE *
bysort gridcell: egen rail030_west = max(dist2west18_030 == 1)
drop dist2west18_030
tabulate year rail030_west
tabulate yearopen if rail030_west == 1
* 1901-1923
* The segment opened in 1923 according to JM16 was actually opened in 1903.
* Indeed, it is the line from Sekondi to Kumasi. A few cells around Kumasi were "connected" as early as 1903 (see Figure A3), and not in 1923 via the Eastern line.
replace yearopen = 1903 if yearopen == 1923 & rail030_west == 1
tabulate yearopen if rail030_west == 1
* 1901-1903 now
* EASTERN LINE *
* We then recreate the Eastern line dummy, since some cells were transferred from the Eastern line to the Western line.
generate rail030_1923 = (dist2line18 <= 30 | dist2line1823 <= 30)
* There is one inconsistency between the 30 km Eastern rail dummy and the 30 km rail dummy (two different GIS software packages were used to create them).
* In that case, we follow the main railroad line variable: a cell-year with rail030 == 0 is never counted as Eastern.
bysort gridcell: egen rail030_east = max(rail030_1923 == 1 & rail030_west == 0 & rail030 != 0)
tabulate yearopen if rail030_east == 1 & year == 1932, missing
* CENTRAL LINE *
* Cells with a railroad within 30 km that belong to neither the Western nor the Eastern line.
bysort gridcell: egen rail030_cent = max(rail030 == 1 & rail030_east == 0 & rail030_west == 0)

* We now re-use Luntinen to improve the timing variable
** Eastern line **
* The first segment was reached in 1909, not in 1910.
* "The construction of the Accra-Mangoasi line was given to contractors, i.e. it was not built by the Construction Department of the Government which had made the western line. The first sod was cut by the Governor on January the 7th, 1909, with due ceremony."
tabulate yearopen if rail030_east == 1
replace yearopen = 1909 if rail030_east == 1 & yearopen == 1910
* Likewise, for the railroad to Mangoasi: construction started in 1913 and the line reached Koforidua in February 1915, so Mangoasi was very likely reached in 1914 rather than 1915.
tabulate yearopen if rail030_east == 1
replace yearopen = 1914 if rail030_east == 1 & yearopen == 1915

* We create the variable for the number of years since a line was opened
* (negative before the opening year; missing when yearopen is missing).
generate yearafteropen = year - yearopen

* We keep if post 1870.
keep if year >= 1870
* Numeric identifier for each district-year string.
egen distyrnum = group(distyr)

* We create the "post" dummies: one per year from 0 to 14 years after opening.
* A logical expression evaluates to 0 when yearafteropen is missing.
forvalues k = 0/14 {
    generate open`k' = (yearafteropen == `k')
    label variable open`k' "Open `k' years after"
}
* The last "post" dummy pools 15 or more years after opening.
generate open15 = (yearafteropen >= 15 & yearafteropen != .)
label variable open15 "Open 15 years after"
* We create the "pre" dummies: one per year from 1 to 14 years before opening.
forvalues k = 1/14 {
    generate openpre`k' = (yearafteropen == -`k')
    label variable openpre`k' "Open `k' years before"
}
* The last "pre" dummy pools 15 or more years before opening.
generate openpre15 = (yearafteropen <= -15 & yearafteropen != .)
label variable openpre15 "Open 15 years before"

* We only keep some variables to make the sample smaller.
keep missions_yn district31 dist2placebo_030 open* rail030 rail030*west rail030*east rail030*cent distyrnum gridnum yearopen* year latitude longitude
sort gridnum year
* We keep the cells close (within 30 km) to a railroad or a placebo line,
* i.e. cells for which either indicator equals 1 in at least one year.
bysort gridnum: egen maxrail030 = max(rail030)
bysort gridnum: egen maxdist2placebo_030 = max(dist2placebo_030)
drop if !(maxrail030 == 1 | maxdist2placebo_030 == 1)
drop maxrail030 maxdist2placebo_030 dist2placeb*
* We label the variables
label variable rail030 "Dummy if railroad open within 30 km t"
label variable yearopen "Year the line was opened"
local linename_west "Western"
local linename_east "Eastern"
local linename_cent "Central"
foreach ln in west east cent {
    label variable rail030_`ln' "Dummy if `linename_`ln'' line within 30 km t"
}
order gridnum year longitude latitude district31 distyrnum missions_yn
save sample_event_study_rail, replace

**************************************************
*** DELETE LARGE TEMPORARY FILES WE DON'T NEED ***
**************************************************

* Remove the intermediate data sets; `capture` makes a missing file a no-op.
foreach stem in ghana_missions1 temp_crops temp west_plac_iv east {
    capture erase "`stem'.dta"
}
